import shutil
import os
import threading
import zipfile
import requests
from contextlib import closing
import lxml.etree as le

# Directory the downloaded archives are written to.
out_dir = './output'
# Number of worker threads.
thread_num = 20
# Timeout (in seconds) applied to HTTP requests.
timeout = 5
# Request headers sent with the download requests (browser-like User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/57.0.2987.133 Safari/537.36'
    ),
}



def file_transfer_handler(default_dir, target_dir):
    """
    Move a file or directory into ``target_dir``.

    Example: file_transfer_handler('../SaveFile', '../targetFile')

    ``target_dir`` is created first when it does not exist yet, and the
    source is then handed to ``shutil.move`` with that directory as the
    destination.

    :param default_dir: path of the file or directory to move
    :param target_dir: destination directory
    :return: True when the move succeeded, False otherwise (the error is
        printed, never raised)
    """
    try:
        # exist_ok=True replaces the racy "exists? -> makedirs" sequence:
        # another thread/process creating the directory in between no
        # longer turns into a spurious failure.
        os.makedirs(target_dir, exist_ok=True)
        shutil.move(default_dir, target_dir)
        print('移动完成: ', target_dir)
        return True
    except Exception as e:
        # Deliberately broad: callers rely on the boolean result instead
        # of handling exceptions.
        print(e)
        return False


def url_response(url):
    """
    Download ``url`` into the current working directory.

    The local file name is the last ``/``-separated segment of the URL.

    :param url: address of the resource to fetch
    :return: None
    :raises requests.exceptions.RequestException: on connection errors,
        timeouts, or an HTTP error status
    """
    path = url.split('/')[-1]
    # The context manager releases the connection even if writing fails;
    # the timeout keeps a stalled server from blocking forever.
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail before creating the file so an error page is never saved
        # under the download's name.
        r.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)


def save_range_handler(start, end, url, filename):
    """
    Fetch one byte range of ``url`` and write it into ``filename``.

    The file is opened in "r+b" mode, so it must already exist; the data
    is written starting at offset ``start``.

    :param start: first byte offset of the range (also the write offset)
    :param end: last byte offset sent in the ``Range`` header
    :param url: address of the resource
    :param filename: path of the pre-created local file
    :return: None
    :raises requests.exceptions.RequestException: on connection errors,
        timeouts, or an HTTP error status
    """
    header = {'Range': 'bytes=%d-%d' % (start, end)}
    with requests.get(url, headers=header, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        if r.status_code != 206 and start != 0:
            # The server ignored the Range header and is sending the whole
            # resource. Writing that at a non-zero offset would corrupt the
            # file, so only the worker for offset 0 stores the body.
            return
        with open(filename, "r+b") as fp:
            fp.seek(start)
            # Stream to disk instead of materialising the whole range in
            # memory via r.content.
            for chunk in r.iter_content(chunk_size=64 * 1024):
                fp.write(chunk)


def range_download(url, num_thread=5):
    """
    Download ``url`` into the current directory using several range requests.

    The size is read from the ``content-length`` header of a HEAD request,
    a local file of that size is pre-allocated, and each worker thread
    fills one slice of it through ``save_range_handler``.

    :param url: address of the resource; the local file name is its last
        ``/``-separated segment
    :param num_thread: number of worker threads (values below 1 are
        treated as 1)
    :return: None
    """
    target_name = url.split('/')[-1]
    # Follow redirects like the later GET requests do, so the reported
    # length belongs to the resource that is actually downloaded.
    r = requests.head(url, allow_redirects=True, timeout=timeout)
    try:
        file_size = int(r.headers['content-length'])
    except (KeyError, ValueError):
        print("检查URL，或不支持对线程下载", url)
        return
    # Pre-allocate the file so every worker can seek to its own offset.
    with open(target_name, "wb") as fp:
        fp.truncate(file_size)
    num_thread = max(1, num_thread)
    part = file_size // num_thread
    workers = []
    for i in range(num_thread):
        start = part * i
        # HTTP byte ranges are inclusive, hence the "- 1"; the last worker
        # also picks up the remainder of the integer division.
        end = (file_size if i == num_thread - 1 else start + part) - 1
        if end < start:
            # Nothing to fetch for this slice (file smaller than the
            # number of threads, or empty file).
            continue
        t = threading.Thread(target=save_range_handler,
                             kwargs={'start': start, 'end': end, 'url': url, 'filename': target_name},
                             daemon=True)
        t.start()
        workers.append(t)
    # Wait only for the threads started here, not for unrelated threads
    # that happen to be alive in the process.
    for t in workers:
        t.join()

    print('%s 下载完成' % target_name)



# os.path.splitext(os.path.basename(f))
def file_name(file_dir):
    """
    Collect the names of all files found under ``file_dir``.

    The directory tree is walked bottom-up and only the file names are
    returned, without their directory part.

    :param file_dir: root directory to scan
    :return: list of file names
    """
    return [
        os.path.basename(entry)
        for _root, _dirs, entries in os.walk(file_dir, topdown=False)
        for entry in entries
    ]


def download(file_name):
    """
    Download one archive from the hourly listing into ``out_dir``.

    A file that already exists locally and is a valid zip is not fetched
    again.

    :param file_name: name of the archive as it appears in the listing
    :return: True when the file is already present or was saved, False on
        any failure (request error, non-200 status, empty body, write
        error); failures are printed, never raised
    """
    file_path = os.path.join(out_dir, file_name)
    if os.path.exists(file_path) and zipfile.is_zipfile(file_path):
        return True
    url = f'https://datalake.abuse.ch/malware-bazaar/hourly/{file_name}'
    try:
        response = requests.get(url, stream=True, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # A timeout or connection error must not escape: it would end the
        # calling worker and skip the rest of its batch.
        print('request fail\t%s\t%s' % (url, exc))
        return False
    with closing(response):
        if response.status_code != 200:
            print('status: %s\t%s' % (response.status_code, url))
            return False
        content_length = int(response.headers.get('content-length', '0'))
        if content_length == 0:
            print('size: 0\t%s' % url)
            return False
        try:
            # Make sure the destination exists; otherwise every open()
            # fails on a fresh checkout.
            os.makedirs(out_dir, exist_ok=True)
            with open(file_path, 'wb') as f:
                for data in response.iter_content(chunk_size=1024):
                    f.write(data)
        except (OSError, requests.exceptions.RequestException):
            print('save fail\t%s' % url)
            return False
    return True


def loop(files):
    """
    Download every entry of ``files`` one after another.

    The result of each individual download is discarded.

    :param files: iterable of file names passed to ``download``
    :return: None
    """
    for name in files:
        download(name)


def download_file_list():
    """
    Fetch the hourly index page and return the link targets it lists.

    The ``href`` values of all ``<td><a>`` links are collected; the first
    one is skipped.

    :return: list of href strings, without the first link
    :raises requests.exceptions.RequestException: on connection errors,
        timeouts, or an HTTP error status
    """
    new_url = 'https://datalake.abuse.ch/malware-bazaar/hourly/'
    # Same headers and timeout as the file downloads, so the listing
    # request cannot hang indefinitely.
    response = requests.get(new_url, headers=headers, timeout=timeout)
    # Do not parse an error page as if it were the listing.
    response.raise_for_status()
    res = response.content.decode('utf-8', 'ignore').replace('\n', '')
    content_xs = le.HTML(res)
    href_xs = content_xs.xpath('//td/a/@href')
    # NOTE(review): the first link is presumably the parent-directory
    # entry of the index page - confirm against the live listing.
    return href_xs[1:]


if __name__ == '__main__':
    # The destination must exist before any worker tries to write into it.
    os.makedirs(out_dir, exist_ok=True)
    pending = download_file_list()
    # Two passes: the initial download, then one retry for every listed
    # file that is still missing or not a valid zip archive.
    for _attempt in range(2):
        if not pending:
            break
        # thread_num is the number of workers, so the names are dealt out
        # round-robin across at most that many threads.
        worker_count = max(1, min(thread_num, len(pending)))
        workers = [
            threading.Thread(target=loop, args=(pending[i::worker_count],))
            for i in range(worker_count)
        ]
        for worker in workers:
            worker.start()
        # Wait for every download to finish before inspecting the files;
        # checking earlier would flag half-written archives as failures
        # and fetch them a second time concurrently.
        for worker in workers:
            worker.join()
        # is_zipfile() is False for missing files too, so downloads that
        # never produced a file are retried as well.
        pending = [
            name for name in pending
            if not zipfile.is_zipfile(os.path.join(out_dir, name))
        ]
